#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


#reading the two CSV extracts: df1 <- datatest.csv, df2 <- datatraining.csv
df1, df2 = (pd.read_csv(csv_name) for csv_name in ('datatest.csv', 'datatraining.csv'))


#combining the datasets
#ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it
#each frame keeps its own row labels, so labels can repeat in the result and
#label-based selection (df.loc[...]) may return several rows for one label
df = pd.concat([df1, df2], ignore_index=True)
df.head()


#number of rows and columns
#(df.shape is a (rows, columns) tuple for the combined frame)
df.shape

(10808, 7)


#counting the missing values in each column
df.isna().sum()

date             0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
dtype: int64


#checking for duplicate values
#duplicated() flags every row that repeats an earlier row; sum() counts the flags
df.duplicated().sum()

27


#keeping only the first occurrence of each repeated row
df = df.drop_duplicates()


#checking data types
#(one dtype per column, before any type conversion is applied)
df.dtypes

date              object
Temperature      float64
Humidity         float64
Light            float64
CO2              float64
HumidityRatio    float64
Occupancy          int64
dtype: object


#parsing the date strings into pandas datetime values
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates


#re-checking data types after the datetime conversion
df.dtypes

date             datetime64[ns]
Temperature             float64
Humidity                float64
Light                   float64
CO2                     float64
HumidityRatio           float64
Occupancy                 int64
dtype: object


#checking the descriptive statistics
#(per-column summary statistics as computed by DataFrame.describe)
df.describe()


#value counts for the target variable i.e. occupancy
#(number of rows per Occupancy class, which shows how balanced the target is)
df['Occupancy'].value_counts()

Occupancy
0    8080
1    2701
Name: count, dtype: int64


#lineplot of temperature over time
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=df, x='date', y='Temperature', ax=ax)
plt.show()


#lineplot of humidity over time
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=df, x='date', y='Humidity', ax=ax)
plt.show()


#lineplot of light over time
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=df, x='date', y='Light', ax=ax)
plt.show()


#lineplot of co2 over time
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=df, x='date', y='CO2', ax=ax)
plt.show()


#lineplot of humidity ratio over time
fig, ax = plt.subplots(figsize=(20, 10))
sns.lineplot(data=df, x='date', y='HumidityRatio', ax=ax)
plt.show()


#annotated heatmap of the pairwise column correlations
#NOTE(review): df still contains the datetime 'date' column at this point -
#confirm that including it in the correlation matrix is intended
fig, ax = plt.subplots(figsize=(20, 10))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()


#violinplot of temperature for each occupancy class
ax = sns.violinplot(data=df, x='Occupancy', y='Temperature')
ax.set_xlabel('Occupancy')
ax.set_ylabel('Temperature')
plt.show()


#boxplot of light for each occupancy class
ax = sns.boxplot(data=df, x='Occupancy', y='Light')
ax.set_xlabel('Occupancy')
ax.set_ylabel('Light')
plt.show()


#violinplot of co2 for each occupancy class
ax = sns.violinplot(data=df, x='Occupancy', y='CO2')
ax.set_xlabel('Occupancy')
ax.set_ylabel('CO2')
plt.show()


#removing the humidity, date and humidity ratio columns from df
df = df.drop(columns=['Humidity', 'date', 'HumidityRatio'])


#previewing the first 10 rows of the remaining columns
df.head(10)


#holding out 20% of the rows for testing; random_state=42 makes the shuffle repeatable
from sklearn.model_selection import train_test_split
features = df.drop(columns=['Occupancy'])
target = df['Occupancy']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)


from sklearn.ensemble import RandomForestClassifier
#random_state seeds the forest's internal randomness so that refitting on the
#same data gives the same model and scores (the train/test split is already
#seeded with 42; an unseeded forest made the reported metrics vary per run)
rfc = RandomForestClassifier(random_state=42)
rfc

RandomForestClassifier()

RandomForestClassifier()


#fitting the forest on the training split
rfc.fit(x_train, y_train)
#accuracy on the same rows the model was fitted on
train_accuracy = rfc.score(x_train, y_train)
train_accuracy

1.0


#predicting occupancy for the held-out test split
rfc_pred = rfc.predict(x_test)


#confusion matrix heatmap
from sklearn.metrics import confusion_matrix
#confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
#predicted classes on the columns, so the heatmap's y-axis is 'Actual' and its
#x-axis is 'Predicted'; fmt='d' prints the cell counts as plain integers
sns.heatmap(confusion_matrix(y_test,rfc_pred),annot=True,fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


#density plot for the predicted and actual values
#kdeplot is the axes-level replacement for the deprecated distplot(hist=False)
ax = sns.kdeplot(x=y_test,label='Actual',color='r')
sns.kdeplot(x=rfc_pred,label='Predicted',color='b',ax=ax)
#the label= values only show up once a legend is drawn
ax.legend()
plt.show()

C:\Users\DELL\AppData\Local\Temp\ipykernel_17804\748502899.py:2: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(y_test,hist=False,label='Actual', color='r')
C:\Users\DELL\AppData\Local\Temp\ipykernel_17804\748502899.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(rfc_pred,hist=False,label='Predicted',color='b',ax=ax)


#per-class precision / recall / f1 on the held-out test split
from sklearn.metrics import classification_report
test_report = classification_report(y_test, rfc_pred)
print(test_report)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      1623
           1       0.97      1.00      0.99       534

    accuracy                           0.99      2157
   macro avg       0.99      0.99      0.99      2157
weighted avg       0.99      0.99      0.99      2157


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


#headline metrics on the held-out test split
print(f'Accuracy Score : {accuracy_score(y_test, rfc_pred)}')
print(f'Precision Score : {precision_score(y_test, rfc_pred)}')
print(f'Recall Score : {recall_score(y_test, rfc_pred)}')
print(f'F1 Score : {f1_score(y_test, rfc_pred)}')

Accuracy Score : 0.9925822902178952
Precision Score : 0.9743589743589743
Recall Score : 0.9962546816479401
F1 Score : 0.9851851851851853


#loading the second test file (datatest2.csv) and previewing its first rows
df_new = pd.read_csv('datatest2.csv')
df_new.head()


#removing the humidity, date and humidity ratio columns from df_new
df_new = df_new.drop(columns=['Humidity', 'date', 'HumidityRatio'])


#separating the feature columns (x) from the Occupancy target (y)
x, y = df_new.drop(columns=['Occupancy']), df_new['Occupancy']


#predicting the values
#(uses the already-fitted rfc on the features of the new dataset)
pred = rfc.predict(x)


#confusion matrix heatmap
#confusion_matrix(y_true, y_pred) puts the true classes on the rows and the
#predicted classes on the columns, so the heatmap's y-axis is 'Actual' and its
#x-axis is 'Predicted'; fmt='d' prints the cell counts as plain integers
sns.heatmap(confusion_matrix(y,pred),annot=True,fmt='d')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()


#density plot for the predicted and actual values
#kdeplot is the axes-level replacement for the deprecated distplot(hist=False)
ax = sns.kdeplot(x=y,label='Actual',color='r')
sns.kdeplot(x=pred,label='Predicted',color='b',ax=ax)
#the label= values only show up once a legend is drawn
ax.legend()
plt.show()

C:\Users\DELL\AppData\Local\Temp\ipykernel_17804\4080147354.py:2: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  ax = sns.distplot(y,hist=False,label='Actual', color='r')
C:\Users\DELL\AppData\Local\Temp\ipykernel_17804\4080147354.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `kdeplot` (an axes-level function for kernel density plots).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  sns.distplot(pred,hist=False,label='Predicted',color='b',ax=ax)


#per-class precision / recall / f1 on the new dataset
new_report = classification_report(y, pred)
print(new_report)

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7703
           1       0.96      0.99      0.97      2049

    accuracy                           0.99      9752
   macro avg       0.98      0.99      0.98      9752
weighted avg       0.99      0.99      0.99      9752


#headline metrics on the new dataset
print(f'Accuracy Score : {accuracy_score(y, pred)}')
print(f'Precision Score : {precision_score(y, pred)}')
print(f'Recall Score : {recall_score(y, pred)}')
print(f'F1 Score : {f1_score(y, pred)}')

Accuracy Score : 0.9883100902379
Precision Score : 0.9565832940066069
Recall Score : 0.9892630551488532
F1 Score : 0.9726487523992322

Column Position	Attribute Name	Definition	Data Type	Example
1	Date	Date & time in month/day/year hour:minute format	Qualitative	2/4/2015 17:57, 2/4/2015 17:55, 2/4/2015 18:06
2	Temperature	Temperature in degrees Celsius	Quantitative	23.150, 23.075, 22.890
3	Humidity	Relative humidity in percentage	Quantitative	27.272000, 27.200000, 27.390000
4	Light	Illuminance measurement in unit Lux	Quantitative	426.0, 419.0, 0.0
5	CO2	CO2 in parts per million (ppm)	Quantitative	489.666667, 495.500000, 534.500000
6	HumidityRatio	Humidity ratio: Derived quantity from temperature and relative humidity, in kgwater-vapor/kg-air	Quantitative	0.004986, 0.005088, 0.005203
7	Occupancy	Occupied or not: 1 for occupied and 0 for not occupied	Quantitative	1, 0

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
0	2/2/2015 14:19	23.7000	26.272	585.200000	749.200000	0.004764	1
1	2/2/2015 14:19	23.7180	26.290	578.400000	760.400000	0.004773	1
2	2/2/2015 14:21	23.7300	26.230	572.666667	769.666667	0.004765	1
3	2/2/2015 14:22	23.7225	26.125	493.750000	774.750000	0.004744	1
4	2/2/2015 14:23	23.7540	26.200	488.600000	779.000000	0.004767	1

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
count	10781	10781.000000	10781.000000	10781.000000	10781.000000	10781.000000	10781.000000
mean	2015-02-06 13:41:14.581207808	20.821800	25.638618	138.036704	634.460328	0.003904	0.250533
min	2015-02-02 14:19:00	19.000000	16.745000	0.000000	412.750000	0.002674	0.000000
25%	2015-02-04 18:23:00	20.000000	21.390000	0.000000	441.000000	0.003323	0.000000
50%	2015-02-06 15:24:00	20.700000	25.680000	0.000000	464.000000	0.003805	0.000000
75%	2015-02-08 12:29:00	21.500000	28.323333	415.000000	763.000000	0.004373	1.000000
max	2015-02-10 09:33:00	24.408333	39.117500	1697.250000	2028.500000	0.006476	1.000000
std	NaN	1.078589	4.954838	212.330275	313.074686	0.000803	0.433340

	Temperature	Light	CO2	Occupancy
0	23.7000	585.200000	749.200000	1
1	23.7180	578.400000	760.400000	1
2	23.7300	572.666667	769.666667	1
3	23.7225	493.750000	774.750000	1
4	23.7540	488.600000	779.000000	1
5	23.7600	568.666667	790.000000	1
6	23.7300	536.333333	798.000000	1
7	23.7540	509.000000	797.000000	1
8	23.7540	476.000000	803.200000	1
9	23.7360	510.000000	809.000000	1

	date	Temperature	Humidity	Light	CO2	HumidityRatio	Occupancy
0	2/11/2015 14:48	21.7600	31.133333	437.333333	1029.666667	0.005021	1
1	2/11/2015 14:49	21.7900	31.000000	437.333333	1000.000000	0.005009	1
2	2/11/2015 14:50	21.7675	31.122500	434.000000	1003.750000	0.005022	1
3	2/11/2015 14:51	21.7675	31.122500	439.000000	1009.500000	0.005022	1
4	2/11/2015 14:51	21.7900	31.133333	437.333333	1005.666667	0.005030	1

Room Occupancy Detection¶

Data Dictionary¶

Data Preprocessing¶

Exploratory Data Analysis¶

Visualizing the temperature fluctuations over time¶

Visualizing the humidity fluctuations over time¶

Visualizing the light fluctuations over time¶

Visualizing the CO2 fluctuations over time¶

Visualizing the humidity ratio fluctuations over time¶

Correlation between the variables¶

Correlation Heatmap¶

Temperature and Occupancy¶

Light and Occupancy¶

CO2 and Occupancy¶

Data Preprocessing 2¶

Train Test Split¶

Model Building¶

Random Forest Classifier¶

Training the model¶

Model Evaluation¶

Testing the model on new dataset¶

Conclusion¶